loadPkg("xtable")
loadPkg("kableExtra")
loadPkg("stringi")
loadPkg("readr")
loadPkg("dplyr")
loadPkg("ggplot2")
loadPkg("countrycode")
loadPkg("reshape")
loadPkg("corrplot")
loadPkg("tidyr")
loadPkg("readxl")


xkabledply = function(modelsmmrytable, title="Table", digits = 4, pos="left", bso="striped", wide=FALSE) { 
  #' Combining base::summary, xtable, and kableExtra, to easily display model summary. 
  #' wrapper for the base::summary function on model objects
  #' Can also use as head for better display
  #' ELo 202004 GWU DATS
  #' version 1.2
  #' @param modelsmmrytable This can be a generic table, a model object such as lm(), or the summary of a model object summary(lm()) 
  #' @param title Title of table. 
  #' @param digits Number of digits to display
  #' @param pos Position of table, c("left","center","right") 
  #' @param bso bootstrap_options = c("basic", "striped", "bordered", "hover", "condensed", "responsive")
  #' @param wide print table in long (FALSE) format or wide (TRUE) format
  #' @return HTML table for display
  #' @examples
  #' library("xtable")
  #' library("kableExtra")
  #' xkabledply( df, title="Table testing", pos="left", bso="hover" )
  #' xkabledply( ISLR::Hitters[1:5,] )
  if (wide) { modelsmmrytable <- t(modelsmmrytable) }
  modelsmmrytable %>%
    xtable() %>% 
    kable(caption = title, digits = digits) %>%
    kable_styling(bootstrap_options = bso, full_width = FALSE, position = pos)
}

xkabledplyhead = function(df, rows=5, title="Head", digits = 4, pos="left", bso="striped") { 
  xkabledply(df[1:rows, ], title, digits, pos, bso, wide=FALSE)
}

xkabledplytail = function(df, rows=5, title="Tail", digits = 4, pos="left", bso="striped") { 
  trows = nrow(df)
  xkabledply(df[ (trows-rows+1) : trows, ], title, digits, pos, bso, wide=FALSE)
}

xkablesummary = function(df, title="Table: Statistics summary.", digits = 4, pos="left", bso="striped") { 
  #' Combining base::summary, xtable, and kableExtra, to easily display numeric variable summary of dataframes. 
  #' ELo 202004 GWU DATS
  #' version 1.2
  #' @param df The dataframe.
  #' @param title Title of table. 
  #' @param digits Number of digits to display
  #' @param pos Position of table, c("left","center","right") 
  #' @param bso bootstrap_options = c("basic", "striped", "bordered", "hover", "condensed", "responsive")
  #' @return The HTML summary table for display, or for knitr to process into other formats 
  #' @examples
  #' xkablesummary( faraway::ozone )
  #' xkablesummary( ISLR::Hitters, title="Five number summary", pos="left", bso="hover"  )
  
  s = summary(df) %>%
    apply( 2, function(x) stringr::str_remove_all(x,c("Min.\\s*:\\s*","1st Qu.\\s*:\\s*","Median\\s*:\\s*","Mean\\s*:\\s*","3rd Qu.\\s*:\\s*","Max.\\s*:\\s*")) ) %>% # replace all leading words
    apply( 2, function(x) stringr::str_trim(x, "right")) # trim trailing spaces left
  
  colnames(s) <- stringr::str_trim(colnames(s))
  
  if ( dim(s)[1] ==6 ) { rownames(s) <- c('Min','Q1','Median','Mean','Q3','Max') 
  } else if ( dim(s)[1] ==7 ) { rownames(s) <- c('Min','Q1','Median','Mean','Q3','Max','NA') }
  
  xkabledply(s, title=title, digits = digits, pos=pos, bso=bso )
}

xkablevif = function(model, title="VIFs of the model", digits = 3, pos="left", bso="striped", wide=TRUE) { 
  #' Combining faraway::vif, xtable, and kableExtra, to easily display numeric summary of VIFs for a model. 
  #' ELo 202004 GWU DATS
  #' version 1.2
  #' @param model The lm or compatible model object.
  #' @param title Title of table. 
  #' @param digits Number of digits to display
  #' @param pos Position of table, c("left","center","right") 
  #' @param bso bootstrap_options = c("basic", "striped", "bordered", "hover", "condensed", "responsive")
  #' @param wide print table in long (FALSE) format or wide (TRUE) format
  #' @return The HTML summary table of the VIFs for a model for display, or for knitr to process into other formats 
  #' @examples
  #' xkablevif( lm(Salary~Hits+RBI, data=ISLR::Hitters), wide=T )
  
  vifs = table( names(model$coefficients)[2:length(model$coefficients)] ) # remove intercept to set column names
  vifs[] = faraway::vif(model) # set the values
  if (wide) { vifs <- t(vifs) }
  xkabledply( vifs, title=title, digits = digits, pos=pos, bso=bso )
}

1. Introduction

1.1 Motivation and Research Objectives

Climate change is one of the most widely researched and heavily debated topics of the 21st century, as it not only impacts human systems and the biosphere, but also the hydrosphere, cryosphere, and lithosphere.

In the past decade alone, carbon dioxide concentrations have risen above 400 parts per million (ppm) for the first time in recorded history (Mikhaylov et al., 2020). Rising land temperatures as well as increased frequency of natural disasters (drought, wildfires, flooding, etc,) are reflective of these changes in atmospheric carbon dioxide concentrations. Furthermore, some regions, such as the Arctic, are disproportionately affected by climate change, with mean land surface temperatures warming at twice the global rate in that region (Overland et al., 2017). Therefore, it is important to not only know about these trends, but also be able to predict these trends for the future.

In project 1, the data analysis focused on examining global surface temperature trends from 1743 to 2013. The goal of project 2 is to find a more optimal model to predict surface temperatures and predict what these trends imply for the future.

1.2 Previous Research

While this topic has been extensively researched, many climate change studies are focused on specific climate feedbacks such as permafrost thaw or Arctic sea ice decline. To add to the already expansive framework of climate change, this project provides a broader analysis of temperature change on a global scale over the past two centuries.

In project 1, a combination of boxplots, histograms, and line graphs were used to explore global and regional trends of surface temperature as well as temperature change between decades and centuries. The histogram of global average surface temperature from 1743 to 2013 yielded a mean of 17.2 on an overall left skewed distribution. This histogram encouraged further analysis since it confirmed that global temperatures fall into an average level of approximately 17℃ or 63℉. A value too high or too low would require reassessment of the analysis on the dataset, since humans require livable temperatures. The Q-Q plot for average surface temperature supported the conclusions drawn by the histogram, as it also suggested a left skewed distribution, with values peaking towards the middle left of the graph.

In the boxplot of surface temperatures across regions, it was observed that the highest temperatures were found in Oceania and Africa, followed by the Americas and Asia, with Europe reporting the coldest temperatures. The line plot highlighted the individual trends of temperature for all regions. This graph especially answers the first SMART question: are surface temperatures changing over time? For all regions previously stated in the boxplot analysis, noticeable temperature increases were observed most notably beginning in the early-to-mid 1800s. Even Oceania, for which the boxplot analysis showed to have some of the highest temperatures among all regions, temperatures increased throughout the mid 1800s to 2000s. The most dramatic rates of temperature increase were observed in the Americas and Asia. For other regions such as Europe, temperatures steadily increased albeit at a slower rate.

Additionally, an ANOVA test was used to analyze if surface temperatures were in fact different among regions. To compare centuries, specifically the 1900s and 2000s, a Welch two sample t-test was employed. The conclusion from this two sample t-test indicated that surface temperatures did increase in the 2000s while it did not increase in the 1990s. To conclude, an initial model was built to test the correlation between all numeric variables.

2. Datasets

2.1 Origin

To meet these aims, this project explored regional and global records of temperature using open source earth surface temperature data from Kaggle and the World Bank.

The first dataset is sourced from Kaggle and contains average surface temperature by country from 1743 to 2013. This dataset consists of more than 544,811 observations and contains four variables: year, average temperature, average temperature uncertainty, and the country in which these temperatures were recorded.

The second dataset is sourced from the World Bank and contains data points by country from 1990 to 2013. This dataset consists of 6,336 observations for each of the following categories: CO2 emissions, renewable energy consumptions (%), population total, energy use, forest area, fossil fuels, electricity access, water withdrawal, pollution exposure and urban population

2.2 Limitations

The expansive nature of the datasets presented several limitations for analysis. With values ranging from 1743 at the earliest to present times at the latest, the surface temperature dataset specifically needed cleaning up and sorting before it could be used for proper exploratory data analysis (EDA).

The first step in rendering the data usable to answer the SMART questions was to create three new variables within the surface temperature data set, including region, century, and decade. Afterwards, the century and decade variables were converted into factor format so as to allow the temperatures to be grouped into certain time periods. The final step in formatting this dataset was to drop the missing values. In a dataset ranging from 1743 to present, there were many missing values that can be traced to a lack of substantial population. For example, many values prior to the 1800s in South America were noticeably void as the region had not yet been colonized.

In the World Bank data sets, there were not as many missing values or formatting issues that needed to be resolved, although the data did require reformatting to be used for the intended purposes for the project. For this project, the categories of CO2 emissions, renewable energy consumptions (%), population total, energy use, and forest area were to be used as predictor variables in the model. To enable EDA using this dataset, the data sets were formatted from wide to long form so that each row would count as a singular observation. Afterwards, data outside of the 1990 to 2013 range was dropped to allow the five variables to be merged together for cleaner processing of the predictor variables in our model. Because some data such as renewable energy use were not measured until the 1990s, this time range was chosen to ensure consistency within the analysis.

3. SMART Questions

Based on the conclusions from project 1, the analysis for the second half of this project will be expanded using feature selection and time series in order to build the optimal model for predicting surface temperatures and to look into future surface temperature trends. To achieve these goals, our Specific, Measurable, Achievable, Realistic, and Timely (SMART) questions are as follows: 1. What are the temperature trends by region? 2. What is the optimal model for predicting surface temperatures? 3. What are the predicted temperature trends over the next century?

4. Analysis

Step 1

Import the data

global_temps <- data.frame(read_csv("Datasets/GlobalTemp.csv"))
names(global_temps)[1] <- 'Year'
global_temps$Year <- as.integer(format(global_temps$Year, format="%Y"))
global_temps$Decade <- (global_temps$Year - global_temps$Year %% 10)
global_temps$Century <- (global_temps$Year - global_temps$Year %% 100)
global_temps

Step 2

Analyze temperature data

temp_histogram <- ggplot(global_temps, aes(x=AverageTemperature)) +
  geom_histogram(color="dark blue", fill= "lightblue") +
  labs(title ="Histogram of Global Average Surface Temperature", subtitle="1743 - 2013") +
  xlab("Average Surface Temperature") +
  ylab("Frequency")
temp_histogram

temp_qq <- qqnorm(global_temps$AverageTemperature, main= "Average Surface Temperature: Normal Q-Q Plot"); qqline(global_temps$AverageTemperature, col="blue")

temp_qq

summary(global_temps)
sd(global_temps$AverageTemperature)

Step 2 Results: Histogram

The histogram of global average surface temperature from 1743 to 2013 yielded expected results, with a mean of 17.2 on an overall left skewed distribution. This histogram allowed further analysis since it confirmed that global temperatures fall into an average level of approximately 17 degrees C or 63 degrees F. A value too high or too low would require reassessment of the dataset, since humans require livable temperatures.

Step 3

Analyze global trends by decade and century

global_temps <-  na.omit(global_temps)
global_temps$Century <- as.factor(global_temps$Century)
global_temps$Decade <- as.factor(global_temps$Decade)

century_temp_data <- global_temps %>% group_by(Century, Year) %>%
  summarize(mean(AverageTemperature))

#line graph of overall trend
century_trends_line_plot <- ggplot() +
  geom_line(data=century_temp_data, aes(x=Year, y=`mean(AverageTemperature)`, color=factor(Century))) +
  xlab("Year") +
  ylab("Average Temperature (celcius)") +
  labs(title="Surface Temperature Trends Over Time", color="Century")
century_trends_line_plot

#boxplot by century
century_year_temp_data_boxplot <- ggplot(century_temp_data, aes(x=as.factor(century_temp_data$Century), y=`mean(AverageTemperature)`, color=Century)) + 
  geom_boxplot() +
  labs(title="Boxplot of Global Surface Temperatures by Century") +
  xlab("Century") +
  ylab("Average Temperature (celcius)") +
  coord_cartesian(ylim = c(0, 20))
century_year_temp_data_boxplot

Step 3 Results: Line and Box Plot

Using the line graph and box plot, it can be determined that mean surface temperatures are increasing with every decade. This is consistent with the general consensus that the climate has been warming over time.

Step 4

Hypothesis Tests

#are 1900s and 2000s statistically different?

#subset data
global_temps_1900s <- subset(global_temps, global_temps$Century == "1900")
global_temps_2000s <- subset(global_temps, global_temps$Century == "2000")
century_ttest <- t.test(global_temps_1900s$AverageTemperature, global_temps_2000s$AverageTemperature, alternative = "greater", conf.level = 0.95)
century_ttest

Step 4 Results: T-Test, Are 1900s and 2000s statistically different?

Using a two-sample t-test, differences between average surface temperatures in the 1900s and 2000s were analyzed. The two-sample t-test yielded a p-value of [1]. Due to the very low p-value, the null hypothesis is rejected implying that the temperature in 1900s and 2000s are slightly different in that there is a slight increase from 1900s to 2000s.

Step 5: Begin Model Building

*Data Cleaning for Linear Regression

#format dataset to single observation by country per year
summary_temp_data <-  global_temps %>% group_by(Year, Country) %>% 
summarize(mean(AverageTemperature))

#reformat World Bank datasets from wide to long (years 1990-2013)
forest_area <- data.frame(read_xls("Datasets/forest_area.xls"))
forest_area <- subset(forest_area, select = -c(2:34))
forest_area <- subset(forest_area, select = -c(26:32))
colnames(forest_area) <- c("Country", "1990", "1991", "1992", "1993", "1994", "1995", "1996", "1997", "1998", "1999",
                           "2000", "2001", "2002", "2003", "2004", "2005", "2006", "2007", "2008", "2009", "2010",
                           "2011", "2012", "2013")
forest_area_long <- gather(forest_area, Year, Forest_Area, 2:25, factor_key = TRUE)

renewable_energy_pct <- data.frame(read_xls("Datasets/renewable_energy.xls"))
renewable_energy_pct <- subset(renewable_energy_pct, select = -c(2:34))
renewable_energy_pct <- subset(renewable_energy_pct, select = -c(26:32))
colnames(renewable_energy_pct) <- c("Country", "1990", "1991", "1992", "1993", "1994", "1995", "1996", "1997", "1998",  "1999","2000", "2001", "2002", "2003", "2004", "2005", "2006", "2007", "2008", "2009", "2010", "2011", "2012", "2013")
renewable_energy_pct_long <- gather(renewable_energy_pct, Year, Renew_Energy_Pct, 2:25, factor_key=TRUE)

population <- data.frame(read_xls("Datasets/population.xls"))
population <- subset(population, select = -c(2:34))
population <- subset(population, select = -c(26:32))
colnames(population) <- c("Country", "1990", "1991", "1992", "1993", "1994", "1995", "1996", "1997", "1998",  "1999","2000", "2001", "2002", "2003", "2004", "2005", "2006", "2007", "2008", "2009", "2010", "2011", "2012", "2013")
population_long <- gather(population, Year, Population_Total, 2:25, factor_key=TRUE)

energy_use <- data.frame(read_xls("Datasets/energy_use.xls"))
energy_use <- subset(energy_use, select = -c(2:34))
energy_use <- subset(energy_use, select = -c(26:32))
colnames(energy_use) <- c("Country", "1990", "1991", "1992", "1993", "1994", "1995", "1996", "1997", "1998",  "1999","2000", "2001", "2002", "2003", "2004", "2005", "2006", "2007", "2008", "2009", "2010", "2011", "2012", "2013")
energy_use_long <- gather(energy_use, Year, Energy_Use, 2:25, factor_key=TRUE)

Co2_emissions <- data.frame(read_xls("Datasets/C02_emissions.xls"))
Co2_emissions <- subset(Co2_emissions, select = -c(2:34))
Co2_emissions <- subset(Co2_emissions, select = -c(26:32))
colnames(Co2_emissions) <- c("Country", "1990", "1991", "1992", "1993", "1994", "1995", "1996", "1997", "1998",  "1999","2000", "2001", "2002", "2003", "2004", "2005", "2006", "2007", "2008", "2009", "2010", "2011", "2012", "2013")
co2_emissions_long <- gather(Co2_emissions, Year, Co2_Emissions, 2:25, factor_key = TRUE)

#merge World Bank Data sets
climate_change1 <- merge(forest_area_long, renewable_energy_pct_long, by=c("Country", "Year"))
climate_change2 <- merge(climate_change1, population_long, by=c("Country", "Year"))
climate_change3 <- merge(climate_change2, energy_use_long, by=c("Country", "Year"))
climate_change <- merge(climate_change3, co2_emissions_long, by=c("Country", "Year"))

#merge with temperature data set
summary_temp_data <- subset(summary_temp_data, summary_temp_data$Year >= 1990)
colnames(summary_temp_data) <- c("Year", "Country", "Temperature")

temp_data <- merge(climate_change, summary_temp_data, by=c("Country", "Year"))

Step 6

Linear Regression

#correlation plot
#remove categorical variables for plot
temp_data_cor <- subset(temp_data, select=-c(1,2))
temp_data_cor[is.na(temp_data_cor)] = 0
tempcor <- cor(temp_data_cor)
corrplot(tempcor, method = "number", type="upper")

#linear model ##Note: Model variables are selected based on variables closer to 1 and -1
model1 <- lm(Temperature ~ Renew_Energy_Pct + Energy_Use + Co2_Emissions + Forest_Area  + Population_Total, data=temp_data)
summary(model1)

#rerun without population total
model2 <- lm(Temperature ~ Renew_Energy_Pct + Energy_Use + Co2_Emissions + Forest_Area, data=temp_data)
summary(model2)

faraway::vif(model2)

Step 6 Results: Model Building

To begin model building, correlation between all numerical variables (forest area, renewable energy percentage, population totals, energy use, CO2 emissions, and temperatures) were examined using a correlation matrix. Based on this plot matrix, all variables could be included in the model. However, it is necessary to consider the correlations between energy usage and CO2 in reality, as environments with higher energy usage tend to have higher CO2 outputs that can cause a rise in temperature as seen in the U.S., China, and India.

The following variables are significant at the 99% confidence level: renewable energy percentage, energy use, CO2 emissions and forest area. The only variable found insignificant by the model was population total.

The model was revised to include only the statistically significant variables mentioned above, omitting the population total variable. At the 99% confidence level, renewable energy percentage, energy use, CO2 emissions and forest area were found to be statistically significant. A check of the VIF values for the revised model indicated possible multicollinearity as energy use and CO2 emissions returned values over 5. The values, however, were not deemed high enough to remove them from the model.

As such, the equation for our final model read that: Surface temperature (Celsius) = 15.70 + 0.116 * Renew_Energy_Pct - 0.0026 * Energy_Use + 0.9250* CO2_Emissions - 0.0000012* Forest_Area

Where the variable units for renewable energy percent resulted from the percentage of total energy usage and energy use equals kilogram of oil equivalent per capita. CO2 emissions are reported in metric tons per capita and forest area in square kilometers. The R Squared for the revised model is 23.5% .

Pull Additional Indicators for Feature Selection

#electricity access (% of population)
electricity <- data.frame(read_xls("Datasets/electricity.xls"))
electricity <- subset(electricity, select = -c(2:34))
electricity <- subset(electricity, select = -c(26:32))
colnames(electricity) <- c("Country", "1990", "1991", "1992", "1993", "1994", "1995", "1996", "1997", "1998", "1999",
                           "2000", "2001", "2002", "2003", "2004", "2005", "2006", "2007", "2008", "2009", "2010",
                           "2011", "2012", "2013")
electricity_long <- gather(electricity, Year, Electricity_Access, 2:25, factor_key=TRUE)

#annual freshwater withdrawals
water <- data.frame(read_xls("Datasets/water.xls"))
water <- subset(water, select = -c(2:34))
water <- subset(water, select = -c(26:32))
colnames(water) <- c("Country", "1990", "1991", "1992", "1993", "1994", "1995", "1996", "1997", "1998", "1999",
                           "2000", "2001", "2002", "2003", "2004", "2005", "2006", "2007", "2008", "2009", "2010",
                           "2011", "2012", "2013")
water_long <- gather(water, Year, Water_withdrawal, 2:25, factor_key=TRUE)

#pollution explosure
pollution <- data.frame(read_xls("Datasets/pollution.xls"))
pollution <- subset(pollution, select = -c(2:34))
pollution <- subset(pollution, select = -c(26:32))
colnames(pollution) <- c("Country", "1990", "1991", "1992", "1993", "1994", "1995", "1996", "1997", "1998", "1999",
                           "2000", "2001", "2002", "2003", "2004", "2005", "2006", "2007", "2008", "2009", "2010",
                           "2011", "2012", "2013")
pollution_long <- gather(pollution, Year, Pollution_Exp, 2:25, factor_key=TRUE)

#urban population
urban <- data.frame(read_xls("Datasets/urban.xls"))
urban <- subset(urban, select = -c(2:34))
urban <- subset(urban, select = -c(26:32))
colnames(urban) <- c("Country", "1990", "1991", "1992", "1993", "1994", "1995", "1996", "1997", "1998", "1999",
                           "2000", "2001", "2002", "2003", "2004", "2005", "2006", "2007", "2008", "2009", "2010",
                           "2011", "2012", "2013")
urban_long <- gather(urban, Year, Urban_Pop, 2:25, factor_key=TRUE)

#merge with existing indicators
climate_change4 <- merge(electricity_long, water_long, by=c("Country", "Year"))
climate_change5 <- merge(climate_change4, pollution_long, by=c("Country", "Year"))
climate_change6 <- merge(climate_change5, urban_long, by=c("Country", "Year"))
temp_data2 <- merge(climate_change6, temp_data, by=c("Country", "Year"))


#merge World Bank Data sets
climate_change1 <- merge(forest_area_long, renewable_energy_pct_long, by=c("Country", "Year"))
climate_change2 <- merge(climate_change1, population_long, by=c("Country", "Year"))
climate_change3 <- merge(climate_change2, energy_use_long, by=c("Country", "Year"))
climate_change <- merge(climate_change3, co2_emissions_long, by=c("Country", "Year"))

#merge with temperature data set
summary_temp_data <- subset(summary_temp_data, summary_temp_data$Year >= 1990)
colnames(summary_temp_data) <- c("Year", "Country", "Temperature")

temp_data <- merge(climate_change, summary_temp_data, by=c("Country", "Year"))

Step 7

Regional Linear Regression - Africa

#correlation plot
#remove categorical variables for plot
temp_data$Region <- countrycode(sourcevar = temp_data[, "Country"],origin = "country.name",destination = "continent")
africa_temp_data_cor <- subset(temp_data ,temp_data$Region == 'Africa',select=-c(1,2,9))
africa_temp_data_cor[is.na(africa_temp_data_cor)] = 0
africa_tempcor <- cor(africa_temp_data_cor)
corrplot(africa_tempcor, method = "number", type="upper")

#linear model ##Note: Model variables are selected based on variables closer to 1 and -1
africa_model <- lm(Temperature ~ Energy_Use + Co2_Emissions + Forest_Area , data=subset(temp_data ,temp_data$Region == 'Africa'))
summary(africa_model)

#faraway::vif(model1)

For the Africa region, the following variables are significant at the 99% confidence level: renewable energy percentage and forest area. The variables found insignificant by the model was energy use and CO2 emissions. The R Squared is 22.9%.

Step 8

Regional Linear Regression - Americas

#correlation plot
#remove categorical variables for plot
temp_data$Region <- countrycode(sourcevar = temp_data[, "Country"],origin = "country.name",destination = "continent")
americas_temp_data_cor <- subset(temp_data ,temp_data$Region == 'Americas',select=-c(1,2,9))
americas_temp_data_cor[is.na(americas_temp_data_cor)] = 0
americas_tempcor <- cor(americas_temp_data_cor)

corrplot(americas_tempcor, method = "number", type="upper")

#linear model ##Note: Model variables are selected based on variables closer to 1 and -1
americas_model <- lm(Temperature ~ Energy_Use + Co2_Emissions  + Forest_Area+ Population_Total  + Renew_Energy_Pct , data=subset(temp_data ,temp_data$Region == 'Americas'))
summary(americas_model)

#faraway::vif(model1)

For the Americas region, all variables are significant at the 99% confidence level. The R Squared is 76.2 % .

Step 9

Regional Linear Regression - Asia

#correlation plot
#remove categorical variables for plot
temp_data$Region <- countrycode(sourcevar = temp_data[, "Country"],origin = "country.name",destination = "continent")
asia_temp_data_cor <- subset(temp_data ,temp_data$Region == 'Asia',select=-c(1,2,9))
asia_temp_data_cor[is.na(asia_temp_data_cor)] = 0
asia_tempcor <- cor(asia_temp_data_cor)
corrplot(asia_tempcor, method = "number", type="upper")

#linear model ##Note: Model variables are selected based on variables closer to 1 and -1
asia_model <- lm(Temperature ~ Energy_Use + Forest_Area+Renew_Energy_Pct , data=subset(temp_data ,temp_data$Region == 'Asia'))
summary(asia_model)

#faraway::vif(model1)

For the Asia region, all variables are significant at the 99% confidence level. The R Squared is 17.3% .

Step 10

Regional Linear Regression - Europe

#correlation plot
#remove categorical variables for plot
temp_data$Region <- countrycode(sourcevar = temp_data[, "Country"],origin = "country.name",destination = "continent")
europe_temp_data_cor <- subset(temp_data ,temp_data$Region == 'Europe',select=-c(1,2,9))
europe_temp_data_cor[is.na(europe_temp_data_cor)] = 0
europe_tempcor <- cor(europe_temp_data_cor)
corrplot(europe_tempcor, method = "number", type="upper")

#linear model ##Note: Model variables are selected based on variables closer to 1 and -1
europe_model <- lm(Temperature ~ Energy_Use +Renew_Energy_Pct + Co2_Emissions + Population_Total, data=subset(temp_data ,temp_data$Region == 'Europe'))
summary(europe_model)

#faraway::vif(model1)

For the Europe region, the following variables are significant at the 99% confidence level: renewable energy percentage and CO2 emissions. The variables found insignificant by the model was energy use and forest area. The R Squared is 19.1% .

Step 11

Regional Linear Regression - Oceania

#correlation plot
#remove categorical variables for plot
temp_data$Region <- countrycode(sourcevar = temp_data[, "Country"],origin = "country.name",destination = "continent")
oceania_temp_data_cor <- subset(temp_data ,temp_data$Region == 'Oceania',select=-c(1,2,9))
oceania_temp_data_cor[is.na(oceania_temp_data_cor)] = 0
oceania_tempcor <- cor(oceania_temp_data_cor)
corrplot(oceania_tempcor, method = "number", type="upper")

#linear model ##Note: Model variables are selected based on variables closer to 1 and -1
oceania_model <- lm(Temperature ~ Energy_Use +Renew_Energy_Pct + Co2_Emissions + Population_Total + Forest_Area, data=subset(temp_data ,temp_data$Region == 'Oceania'))
summary(oceania_model)

#faraway::vif(model1)

For the Oceania region, the following variables are significant at the 99% confidence level: renewable energy percentage, energy use, and CO2 emissions. Forest area is significant at the 95% confidence level. The R Squared is 98.4% .

Interpretation of Linear Models by Region

While the linear models for the Americas and Asia agreed that all variables were statistically significant, they displayed dramatically different R-squared values. A possible interpretation is that the R-squared value for the Americas was higher than that of Asia because of a higher intensity in the predictor variables. For example, while CO2 emissions were significant for both regions, the Americas potentially emitted more, giving this predictor a higher influence on the final R-squared.

Oceania, Europe and Africa all agreed that renewable energy percentage was significant at a 99% confidence level, although only CO2 emissions were found significant at the same level in Oceania and Europe. In Africa, CO2 emissions and energy use were found insignificant. As renewable energy use was a significant variable in Asia and the Americas, it is logical that it also contributed to our final calculation of surface temperature for the other regions as well. Based on this, we can conclude that renewable energy use will be a significant variable in determining surface temperature regardless of the region.

As for energy use and CO2 emissions being insignificant in Africa, this might be correlated to low emissions and energy use by the region. Here, it is possible that CO2 is not emitted at a high enough intensity to contribute to the individual region’s surface temperature trends, and not that CO2 and energy use are insignificant variables.

Oceania, Africa and Europe display similar disparities in R-squared values to those between the Americas and Asia. In Europe the model had the lowest R-squared values, followed by Africa and then Oceania with the largest R-squared values. Europe having the lowest R-squared value is somewhat unexpected given that it had an industrial revolution and CO2 emissions similar to that seen in the Americas. We believe that the R-squared value is so low because Europe had the greatest time span of the regions in the dataset. While the time record for the Americas begins post-colonization and essentially at the start of the industrial revolution, Europe had almost an entire century of no emissions or significant energy use that reduces the magnitude of these predictor variables.

Step 12

Rebuild Linear Regression with new variables

#correlation plot
#remove categorical variables for plot
temp_data_cor2 <- subset(temp_data2, select=-c(1,2,13))
temp_data_cor2[is.na(temp_data_cor2)] = 0
tempcor2 <- cor(temp_data_cor2)
corrplot(tempcor2, method = "number", type="upper")

#linear model ##Note: Model variables are selected based on variables closer to 1 and -1
model1 <- lm(Temperature ~ Energy_Use + Renew_Energy_Pct  + Co2_Emissions + Forest_Area +Pollution_Exp  + Population_Total, data=temp_data2)
summary(model1)

model2 <- lm(Temperature ~ Energy_Use + Renew_Energy_Pct  + Co2_Emissions +Pollution_Exp  + Population_Total, data=temp_data2)
summary(model2)

To optimize the original model, a correlation plot was built and variables with values closest to 1 and -1 were used for the new equation. Using an exhaustive method, adjusted R-squared, BIC, and Cp were used as selection criteria. At the 99% confidence level, all three selection criteria determined population and electricity access the most significant, where R-squared equaled 33.4%. The new model read that:

Surface Temperature (Celsius) = 21.84 + 0.159* Pollution_exposure - 0.125 * Electricity_ access - 0.0014* Energy_use + 0.065* Urban_population + 0.322*Co2_emissions

The R-squared value of this model was 0.334 or 33.4%.

Step 13

Feature Selection (Adjusted R^2)

loadPkg("leaps")
temp_data2
temp_data_clean2 = temp_data2[ , c("Forest_Area", "Renew_Energy_Pct", "Population_Total", "Energy_Use", "Co2_Emissions","Temperature","Electricity_Access","Water_withdrawal","Pollution_Exp","Urban_Pop")] 

adj_selection <- regsubsets(Temperature~., data = temp_data_clean2, nbest = 2, method = "exhaustive") 
plot(adj_selection, scale = "adjr2", main = "Adjusted R^2")

adj_selection_summary = summary(adj_selection)
adj_selection_summary$which[8,]
adj_selection_summary

adj_model <- lm(Temperature ~ Pollution_Exp + Electricity_Access + Energy_Use +  Urban_Pop + Co2_Emissions , data = temp_data_clean2)
summary(adj_model)
faraway::vif(adj_model)

Step 14

Feature Selection (BIC)

bic_selection <- regsubsets(Temperature~., data = temp_data_clean2, nbest = 2, method = "exhaustive") 
plot(bic_selection, scale = "bic", main = "BIC")

bic_selection_summary = summary(bic_selection)
bic_selection_summary$which[8,]
bic_selection_summary

bic_model <- lm(Temperature ~ Pollution_Exp + Electricity_Access + Energy_Use +  Urban_Pop + Co2_Emissions , data = temp_data_clean2)
summary(bic_model)
faraway::vif(bic_model)

Step 15

Feature Selection (Cp)

cp_selection <- regsubsets(Temperature~., data = temp_data_clean2, nbest = 2, method = "exhaustive") 
plot(cp_selection, scale = "Cp", main = "Cp")

cp_selection_summary = summary(cp_selection)
cp_selection_summary$which[8,]

cp_model <-  lm(Temperature ~ Pollution_Exp + Electricity_Access + Energy_Use +  Urban_Pop + Co2_Emissions , data = temp_data_clean2)
faraway::vif(cp_model)

Interpretation of Optimized Model

After using feature selection to compare models, it was found that the model Surface Temperature (Celsius) = 21.84 + 0.159* Pollution_exposure - 0.125 * Electricity_ access - 0.0014* Energy_use + 0.065* Urban_population + 0.322Co2_emissions was a more optimal model with a R-squared of 0.334 or 33.4%, which was an improvement from the preliminary linear model of Surface temperature (celsius) = 15.70 + 0.116 Renew_energy_pct - 0.0026 * Energy_use + 0.9250* Co2_emissions - 0.0000012* Forest_area, which had an R-Squared of 0.235 or 23.5%.

Global Time Series Analysis

#create summary dataset
loadPkg("dplyr")
summary_temps <- global_temps %>% group_by(Year) %>%
  summarize(mean(AverageTemperature))
colnames(summary_temps) <- c("Year","Temperature")

#time series of temperatures from 1743
#ts1 <- ts(data = summary_temps$Temperature, start=c(1743), frequency=1)
# plot.ts(ts1, type="l", col="blue", lwd =2, 
#         ylab="Annual Surface Temperature (Celcius)", xlab="Year") +
#   title("Annual Global Surface Temperatures (1743-2013)") +
#   abline(h=0, col="darkorange2", lwd=2) +
#   text(2000, -0.1, "1743-2013 average")


#subset for data from 1900
summary_temps_1900 <- subset(summary_temps, summary_temps$Year >= 1900)
ts2 <- ts(data = summary_temps_1900$Temperature, start=c(1900), frequency=1)
plot.ts(ts2, type ="l", col="blue", lwd=2,
        ylab="Annual Surface Temperature (Celcius)", xlab="Year") +
  title("Annual Global Surface Temperature (1900-2013)")

loadPkg("forecast")
loadPkg("zoo")
model1 <- auto.arima(as.zoo(ts2))
forecast(model1, 40)
plot1 <- plot(forecast(model1, 40), main = "Global Temperature Predictions until 2053", xlab = "Year", ylab = "Average Temperature (Celcius)")

United States Analysis

US_temps <- subset(global_temps, global_temps$Country == "United States")

summary_US_temps <- US_temps %>% group_by(Year) %>%
  summarize(mean(AverageTemperature))
colnames(summary_US_temps) <- c("Year","Temperature")

US_temps_1900 <- subset(summary_US_temps, summary_US_temps$Year >= 1900)

# ts_us <- ts(data = summary_US_temps$Temperature, start=c(1768), frequency=1)
# ts_us_plot <- plot.ts(ts_us, type ="l", col="blue", lwd=2,
#         ylab="Annual Surface Temperature (Celcius)", xlab="Year") +
#   title("Annual Global Surface Temperature in United States (1768-2013)")

ts_us2 <- ts(data = US_temps_1900$Temperature, start=c(1900), frequency=1)
ts_us_plot2 <- plot.ts(ts_us2, type ="l", col="blue", lwd=2,
        ylab="Annual Surface Temperature (Celcius)", xlab="Year") +
  title("Annual Global Surface Temperature in United States (1900-2013)")

loadPkg("forecast")
loadPkg("zoo")
model2 <- auto.arima(as.zoo(ts_us2))
forecast(model2, 40)
plot2 <- plot(forecast(model2, 40), main = "United States Temperature Predictions until 2053", xlab = "Year", ylab = "Average Temperature (Celcius)")

Iceland Analysis

Iceland_temps <- subset(global_temps, global_temps$Country == "Iceland")

summary_Iceland_temps <- Iceland_temps %>% group_by(Year) %>%
  summarize(mean(AverageTemperature))
colnames(summary_Iceland_temps) <- c("Year","Temperature")

Iceland_temps_1900 <- subset(summary_Iceland_temps, summary_Iceland_temps$Year >= 1900)

ts_ice <- ts(data = Iceland_temps_1900$Temperature, start=c(1900), frequency=1)
ts_ice_plot <- plot.ts(ts_ice, type ="l", col="blue", lwd=2,
        ylab="Annual Surface Temperature (Celcius)", xlab="Year") +
  title("Annual Global Surface Temperature in Iceland (1900-2013)")

loadPkg("forecast")
loadPkg("zoo")
model3 <- auto.arima(as.zoo(ts_ice))
forecast(model3, 40)
plot3 <- plot(forecast(model3, 40), main = "Iceland Temperature Predictions until 2053", xlab = "Year", ylab = "Average Temperature (Celcius)")

Haiti Analysis

Haiti_temps <- subset(global_temps, global_temps$Country == "Haiti")

summary_haiti_temps <- Haiti_temps %>% group_by(Year) %>%
  summarize(mean(AverageTemperature))
colnames(summary_haiti_temps) <- c("Year","Temperature")

haiti_temps_1900 <- subset(summary_haiti_temps, summary_haiti_temps$Year >= 1900)

ts_haiti <- ts(data = haiti_temps_1900$Temperature, start=c(1900), frequency=1)
ts_haiti_plot <- plot.ts(ts_haiti, type ="l", col="blue", lwd=2,
        ylab="Annual Surface Temperature (Celcius)", xlab="Year") +
  title("Annual Global Surface Temperature in Haiti (1900-2013)")

loadPkg("forecast")
loadPkg("zoo")
model4 <- auto.arima(as.zoo(ts_haiti))
forecast(model4, 40)
plot4 <- plot(forecast(model4, 40), main = "Haiti Temperature Predictions until 2053", xlab = "Year", ylab = "Average Temperature (Celcius)")

Lastly, time series analysis was used to answer the third SMART question. The first time series analysis looked at the global scale and summarized temperatures from 1743-2013 and then forecasted predictions up to the year 2053. To further explore trends over the next century, specific countries were selected from different regions, namely the United States of America, Haiti and Iceland. For these individual time series, temperatures from 1900-2013 were summarized and predictions were forecasted up to the year 2053.

Interpretation of Time Series Analysis

The time series analysis projected surface temperature to rise globally for the period up to 2053. For the United States and Haiti, the time series analysis also forecasted rising temperatures. Iceland, however, did not have any conclusive trend upwards or downwards in predicted temperatures. Because this refutes general climate trends, the original dataset was examined for any errors within the recordings for Iceland. Although there were not any obvious missing records or alarming entries, there is most likely some error causing the discrepancy between published trends and the data analysis for this project.

Conclusions

SMART Question 1: What are the temperature trends by region? From 1743 to 2013, the Americas experienced the greatest temperature change. The highest mean surface temperatures were found in Oceania and Africa, followed by the Americas and Asia, with the coldest mean temperatures occurring in Europe. There were also noticeable temperature increases most notably beginning in the early-to-mid 1800s for all regions. Lastly, surface temperatures were found to be different across all regions.

SMART Question 2: What is the optimal model for predicting surface temperatures? After using feature selection to compare models, it was found that the second model based on newly selected features was more optimal than our original model. The new model had an R-squared of 33.4% while the original had an R-squared of 23.5%. Given the complexity of climate and temperature, we believe that this is a very well performing model.

SMART Question 3: What are the predicted temperature trends over the next century? Surface temperatures are projected to rise globally until 2053. This is consistent with the predictions of many climate models. Surface temperatures were also projected to rise in the United States and Haiti. Based on the results for this project, there does not seem to be a projected rise or fall in surface temperatures for Iceland. This finding is interesting and contradictory to what is generally expected. Based on the Third Report on Impacts of Climate Change in Iceland from the Iceland Meteorological Office, there is a projected 1.34–2.10°C warming depending on RCP scenario (2.6 or 8.5) in the middle of this century and a 1.50–4.10°C warming for the last decade of the 21st century (Third Report on Impacts of Climate Change in Iceland). Perhaps, there were errors in the Iceland dataset or the timeframe used in the time series analysis did not allow for the correct projection to be made.

References

Luyssaert, S., Schulze, E. D., Börner, A., Knohl, A., Hessenmöller, D., Law, B. E., … & Grace, J. (2008). Old-growth forests as global carbon sinks. Nature, 455(7210), 213-215.

Mikhaylov, A., Moiseev, N., Aleshin, K., & Burkhardt, T. (2020). Global climate change and greenhouse effect. Entrepreneurship and Sustainability Issues, 7(4), 2897.

Overland, J. E., Hanna, E., Hanssen-Bauer, I., Kim, S. J., Walsh, J. E., Wang, M., Bhatt, U.S., & Thoman, R. L. (2017). Surface air temperature. Arctic report card.

Third Report on Impacts of Climate Change in Iceland. (n.d.). Retrieved May 05, 2021, from https://en.vedur.is/climatology/iceland/climate-report